d
$x \in \{0, 1\}^d$
$$P(x) = \frac{\exp(-E(x))}{Z}.$$
E(x) Z
h
1
h
2
h
3
v
1
v
2
v
3
h
4
h
1
(1)
h
2
(1)
h
3
(1)
v
1
v
2
v
3
h
1
(2)
h
2
(2)
h
3
(2)
h
4
(1)
h
1
(1)
h
2
(1)
h
3
(1)
v
1
v
2
v
3
h
1
(2)
h
2
(2)
h
3
(2)
h
4
(1)
x
P (x) = 1
$$E(x) = -x^\top U x - b^\top x,$$
U b
x
d
x
v
x
h
x x
v
x
h
$$E(x_v, x_h) = -x_v^\top R\, x_v - x_v^\top W x_h - x_h^\top S\, x_h - b^\top x_v - c^\top x_h,$$
$n$
$X_v = [x_v^{(1)}, \dots, x_v^{(t)}, \dots, x_v^{(n)}]$
$$\ell(\theta) = \log P(X_v) = \sum_{t=1}^{n} \log P(x_v^{(t)}).$$
P (x
(t)
v
)
x
v
x
h
P (x
(t)
v
) x
h
$$P(x_v^{(t)}) = \sum_{x_h} P(x_v^{(t)}, x_h^{(t)}) = \sum_{x_h} \frac{1}{Z} \exp\left(-E(x_v^{(t)}, x_h^{(t)})\right).$$
Z
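$$\begin{aligned}
\nabla_\theta \ell(\theta) &= \nabla_\theta \sum_{t=1}^{n} \log \left[ \frac{1}{Z} \sum_{x_h} \exp\left(-E(x_v^{(t)}, x_h^{(t)})\right) \right] \\
&= \sum_{t=1}^{n} \left[ \nabla_\theta \log \sum_{x_h} \exp\left(-E(x_v^{(t)}, x_h^{(t)})\right) - \nabla_\theta \log Z \right] \\
&= \sum_{t=1}^{n} \left[ \frac{\sum_{x_h} \exp\left(-E(x_v^{(t)}, x_h^{(t)})\right) \nabla_\theta \left(-E(x_v^{(t)}, x_h^{(t)})\right)}{\sum_{x_h} \exp\left(-E(x_v^{(t)}, x_h^{(t)})\right)} - \nabla_\theta \log Z \right]
\end{aligned}$$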
D N
D
v
i v
i
N h j
h
j
$$P(\mathbf{v} = v, \mathbf{h} = h) = \frac{1}{Z} \exp\{-E(v, h)\}.$$
E(v, h)
$$E(v, h) = -b^\top v - c^\top h - v^\top W h,$$
Z
$$Z = \sum_{v} \sum_{h} \exp\{-E(v, h)\}.$$
Z Z
Z
Z
[v, h]
P (h | v) P (v | h)
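$$\begin{aligned}
p(h \mid v) &= \frac{p(h, v)}{p(v)} \\
&= \frac{1}{p(v)} \frac{1}{Z} \exp\left\{ b^\top v + c^\top h + v^\top W h \right\} \\
&= \frac{1}{Z'} \exp\left\{ c^\top h + v^\top W h \right\} \\
&= \frac{1}{Z'} \exp\left\{ \sum_{j=1}^{n} c_j h_j + \sum_{j=1}^{n} v^\top W_{:,j} h_j \right\} \\
&= \frac{1}{Z'} \prod_{j=1}^{n} \exp\left\{ c_j h_j + v^\top W_{:,j} h_j \right\}
\end{aligned}$$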
v
p(h | v) p(h | v)
h h
j
h
j
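$$\begin{aligned}
P(h_j = 1 \mid v) &= \frac{\tilde{P}(h_j = 1 \mid v)}{\tilde{P}(h_j = 0 \mid v) + \tilde{P}(h_j = 1 \mid v)} \\
&= \frac{\exp\left\{ c_j + v^\top W_{:,j} \right\}}{\exp\{0\} + \exp\left\{ c_j + v^\top W_{:,j} \right\}} \\
&= \operatorname{sigmoid}\left( c_j + v^\top W_{:,j} \right).
\end{aligned}$$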
$$P(h \mid v) = \prod_{j=1}^{n} \operatorname{sigmoid}\left( c_j + v^\top W_{:,j} \right).$$
P (v |
h)
$$P(v \mid h) = \prod_{i=1}^{d} \operatorname{sigmoid}\left( b_i + W_{i,:} h \right).$$
h
(l)
P(h | v
(l)
)
h
(l)
v
(l)
v
(l+1)
P(v | h
(l)
)
P (v | h
(l)
)
v
(l+1)
h
(l)
n
{v
(1)
, . . . , v
(t)
, . . . , v
(n)
}
b c
W
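$$\begin{aligned}
\ell(W, b, c) &= \sum_{t=1}^{n} \log P(v^{(t)}) \\
&= \sum_{t=1}^{n} \log \sum_{h} P(v^{(t)}, h) \\
&= \sum_{t=1}^{n} \log \sum_{h} \exp\left\{ -E(v^{(t)}, h) \right\} - n \log Z \\
&= \sum_{t=1}^{n} \log \sum_{h} \exp\left\{ -E(v^{(t)}, h) \right\} - n \log \sum_{v, h} \exp\{-E(v, h)\}
\end{aligned}$$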
θ = {b, c, W }
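$$\begin{aligned}
\nabla_\theta \ell(\theta) &= \nabla_\theta \sum_{t=1}^{n} \log \sum_{h} \exp\left\{ -E(v^{(t)}, h) \right\} - n \nabla_\theta \log \sum_{v, h} \exp\{-E(v, h)\} \\
&= \sum_{t=1}^{n} \frac{\sum_{h} \exp\left\{ -E(v^{(t)}, h) \right\} \nabla_\theta \left( -E(v^{(t)}, h) \right)}{\sum_{h} \exp\left\{ -E(v^{(t)}, h) \right\}} - n \frac{\sum_{v, h} \exp\{-E(v, h)\} \nabla_\theta \left( -E(v, h) \right)}{\sum_{v, h} \exp\{-E(v, h)\}} \\
&= \sum_{t=1}^{n} \mathbb{E}_{P(h \mid v^{(t)})}\left[ \nabla_\theta \left( -E(v^{(t)}, h) \right) \right] - n\, \mathbb{E}_{P(v, h)}\left[ \nabla_\theta \left( -E(v, h) \right) \right]
\end{aligned}$$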
P (v) = 1/n
n
t=1
δ(x v
(t)
)
P (h | v
(t)
)
P (v, h)
θ
E(v, h)
W
$$\nabla_W \left( -E(v, h) \right) = \frac{\partial}{\partial W} \left( b^\top v + c^\top h + v^\top W h \right) = h v^\top$$
b c
$$\nabla_b \left( -E(v, h) \right) = v, \qquad \nabla_c \left( -E(v, h) \right) = h$$
n
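$$\begin{aligned}
\nabla_W \ell(W, b, c) &= \sum_{t=1}^{n} \hat{h}^{(t)} v^{(t)\top} - n\, \mathbb{E}_{P(v, h)}\left[ h v^\top \right] \\
\nabla_b \ell(W, b, c) &= \sum_{t=1}^{n} v^{(t)} - n\, \mathbb{E}_{P(v, h)}[v] \\
\nabla_c \ell(W, b, c) &= \sum_{t=1}^{n} \hat{h}^{(t)} - n\, \mathbb{E}_{P(v, h)}[h]
\end{aligned}$$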
ˆ
h
(t)
$$\hat{h}^{(t)} = \mathbb{E}_{P(h \mid v^{(t)})}[h] = \operatorname{sigmoid}\left( c + v^{(t)\top} W \right).$$
P (v, h) P (v | h)
P (h | v)
E
P (v,h)
[f(v, h)]
P (v, h)
T
P (v, h)
$$\mathbb{E}_{P(v, h)}[f(v, h)] \approx \frac{1}{T} \sum_{t=1}^{T} f(v^{(t)}, h^{(t)}).$$
k p(v; θ)
p
m {v
(1)
, . . . , v
(m)
}
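$\Delta_W \leftarrow \frac{1}{m} \sum_{t=1}^{m} v^{(t)} \hat{h}^{(t)\top}$
$\Delta_b \leftarrow \frac{1}{m} \sum_{t=1}^{m} v^{(t)}$
$\Delta_c \leftarrow \frac{1}{m} \sum_{t=1}^{m} \hat{h}^{(t)}$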
t = 1 m
˜
v
(t)
v
(t)
l = 1 k
t = 1 m
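$\tilde{h}^{(t)} \sim \prod_{j=1}^{n} \operatorname{sigmoid}\left( c_j + \tilde{v}^{(t)\top} W_{:,j} \right)$
$\tilde{v}^{(t)} \sim \prod_{i=1}^{d} \operatorname{sigmoid}\left( b_i + W_{i,:} \tilde{h}^{(t)} \right)$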
$\bar{h}^{(t)} \leftarrow \operatorname{sigmoid}\left( c + \tilde{v}^{(t)\top} W \right)$
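$\Delta_W \leftarrow \Delta_W - \frac{1}{m} \sum_{t=1}^{m} \tilde{v}^{(t)} \bar{h}^{(t)\top}$
$\Delta_b \leftarrow \Delta_b - \frac{1}{m} \sum_{t=1}^{m} \tilde{v}^{(t)}$
$\Delta_c \leftarrow \Delta_c - \frac{1}{m} \sum_{t=1}^{m} \bar{h}^{(t)}$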
$W \leftarrow W + \epsilon \Delta_W$
$b \leftarrow b + \epsilon \Delta_b$
$c \leftarrow c + \epsilon \Delta_c$
k
s
s 1
s
k
p(v, h; θ +
θ
) p(v, h; θ)
m {
˜
v
(1)
, . . . ,
˜
v
(m)
}
m {v
(1)
, . . . , v
(m)
}
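$\Delta_W \leftarrow \frac{1}{m} \sum_{t=1}^{m} v^{(t)} \hat{h}^{(t)\top}$
$\Delta_b \leftarrow \frac{1}{m} \sum_{t=1}^{m} v^{(t)}$
$\Delta_c \leftarrow \frac{1}{m} \sum_{t=1}^{m} \hat{h}^{(t)}$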
l = 1 k
t = 1 m
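$\tilde{h}^{(t)} \sim \prod_{j=1}^{n} \operatorname{sigmoid}\left( c_j + \tilde{v}^{(t)\top} W_{:,j} \right)$
$\tilde{v}^{(t)} \sim \prod_{i=1}^{d} \operatorname{sigmoid}\left( b_i + W_{i,:} \tilde{h}^{(t)} \right)$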
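$\Delta_W \leftarrow \Delta_W - \frac{1}{m} \sum_{t=1}^{m} \tilde{v}^{(t)} \tilde{h}^{(t)\top}$
$\Delta_b \leftarrow \Delta_b - \frac{1}{m} \sum_{t=1}^{m} \tilde{v}^{(t)}$
$\Delta_c \leftarrow \Delta_c - \frac{1}{m} \sum_{t=1}^{m} \tilde{h}^{(t)}$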
$W \leftarrow W + \epsilon \Delta_W$
$b \leftarrow b + \epsilon \Delta_b$
$c \leftarrow c + \epsilon \Delta_c$
L L W
(1)
, . . . , W
(L)
L + 1 b
(0)
, . . . , b
(L)
b
(0)
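$$p(h^{(L)}, h^{(L-1)}) \propto \exp\left( b^{(L)\top} h^{(L)} + b^{(L-1)\top} h^{(L-1)} + h^{(L-1)\top} W^{(L)} h^{(L)} \right),$$
$$p(h_i^{(l)} = 1 \mid h^{(l+1)}) = \sigma\left( b_i^{(l)} + W_{:,i}^{(l+1)\top} h^{(l+1)} \right) \quad \forall i,\ \forall l \in 1, \dots, L-2,$$
$$p(v_i = 1 \mid h^{(1)}) = \sigma\left( b_i^{(0)} + W_{:,i}^{(1)\top} h^{(1)} \right) \quad \forall i.$$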
$$v \sim \mathcal{N}\left( v \mid b^{(0)} + W^{(1)\top} h^{(1)},\ \beta^{-1} \right)$$
β
E
v
h
(1)
, h
(2)
andh
(3)
$$P\left( v, h^{(1)}, h^{(2)}, h^{(3)} \right) = \frac{1}{Z(\theta)} \exp\left( -E(v, h^{(1)}, h^{(2)}, h^{(3)}; \theta) \right).$$
$$E(v, h^{(1)}, h^{(2)}, h^{(3)}; \theta) = -v^\top W^{(1)} h^{(1)} - h^{(1)\top} W^{(2)} h^{(2)} - h^{(2)\top} W^{(3)} h^{(3)}.$$
W
(2)
W
(3)
h
1
(1)
h
2
(1)
h
3
(1)
v
1
v
2
v
3
h
1
(2)
h
2
(2)
h
3
(2)
h
4
(1)
P (h
(1)
= 1 | v, h
(2)
)
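$$\begin{aligned}
P(h^{(1)} \mid v, h^{(2)}) &= \frac{P(h^{(1)}, v, h^{(2)})}{P(v, h^{(2)})} \\
&= \frac{\exp\left( v^\top W^{(1)} h^{(1)} + h^{(1)\top} W^{(2)} h^{(2)} \right)}{\sum_{h_1^{(1)}=0}^{1} \cdots \sum_{h_n^{(1)}=0}^{1} \exp\left( v^\top W^{(1)} h^{(1)} + h^{(1)\top} W^{(2)} h^{(2)} \right)} \\
&= \frac{\exp\left( \sum_{j=1}^{n} v^\top W^{(1)}_{:,j} h^{(1)}_j + h^{(1)}_j W^{(2)}_{j,:} h^{(2)} \right)}{\sum_{h_1^{(1)}=0}^{1} \cdots \sum_{h_n^{(1)}=0}^{1} \exp\left( \sum_{j'=1}^{n} v^\top W^{(1)}_{:,j'} h^{(1)}_{j'} + h^{(1)}_{j'} W^{(2)}_{j',:} h^{(2)} \right)} \\
&= \frac{\prod_{j} \exp\left( v^\top W^{(1)}_{:,j} h^{(1)}_j + h^{(1)}_j W^{(2)}_{j,:} h^{(2)} \right)}{\sum_{h_1^{(1)}=0}^{1} \cdots \sum_{h_n^{(1)}=0}^{1} \prod_{j'} \exp\left( v^\top W^{(1)}_{:,j'} h^{(1)}_{j'} + h^{(1)}_{j'} W^{(2)}_{j',:} h^{(2)} \right)} \\
&= \prod_{j} \frac{\exp\left( v^\top W^{(1)}_{:,j} h^{(1)}_j + h^{(1)}_j W^{(2)}_{j,:} h^{(2)} \right)}{\sum_{h_j^{(1)}=0}^{1} \exp\left( v^\top W^{(1)}_{:,j} h^{(1)}_j + h^{(1)}_j W^{(2)}_{j,:} h^{(2)} \right)} \\
&= \prod_{j} \frac{\exp\left( v^\top W^{(1)}_{:,j} h^{(1)}_j + h^{(1)}_j W^{(2)}_{j,:} h^{(2)} \right)}{1 + \exp\left( v^\top W^{(1)}_{:,j} + W^{(2)}_{j,:} h^{(2)} \right)} \\
&= \prod_{j} P(h^{(1)}_j \mid v, h^{(2)}).
\end{aligned}$$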
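$$\begin{aligned}
P(h^{(1)}_j = 1 \mid v, h^{(2)}) &= \frac{\exp\left( v^\top W^{(1)}_{:,j} + W^{(2)}_{j,:} h^{(2)} \right)}{1 + \exp\left( v^\top W^{(1)}_{:,j} + W^{(2)}_{j,:} h^{(2)} \right)} \\
&= \frac{1}{1 + \exp\left( -v^\top W^{(1)}_{:,j} - W^{(2)}_{j,:} h^{(2)} \right)} \\
&= \operatorname{sigmoid}\left( v^\top W^{(1)}_{:,j} + W^{(2)}_{j,:} h^{(2)} \right).
\end{aligned}$$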
v, h
(2)
$$P(v \mid h^{(1)}) = \prod_{i=1}^{d} P(v_i \mid h^{(1)})$$
$$P(v_i = 1 \mid h^{(1)}) = \operatorname{sigmoid}\left( W^{(1)}_{i,:} h^{(1)} \right).$$
$$P(h^{(2)} \mid h^{(1)}) = \prod_{k=1}^{m} P(h^{(2)}_k \mid h^{(1)})$$
$$P(h^{(2)}_k = 1 \mid h^{(1)}) = \operatorname{sigmoid}\left( h^{(1)\top} W^{(2)}_{:,k} \right).$$
P (v | h
(1)
) P (h
(1)
|
v, h
(2)
) P (h
(2)
| h
(1)
)
P (h
(1)
, h
(2)
| v)
W
(2)
h
(1)
h
(2)
v
Q(h
(1)
, h
(2)
| v) P (h
(1)
, h
(2)
| v)
$$Q(h^{(1)}, h^{(2)} \mid v) = \prod_{j=1}^{n} Q(h^{(1)}_j \mid v) \prod_{k=1}^{m} Q(h^{(2)}_k \mid v).$$
P (h
(1)
, h
(2)
| v)
Q
P KL(QP )
$$\mathrm{KL}(Q \,\|\, P) = \sum_{h} Q(h^{(1)}, h^{(2)} \mid v) \log \frac{Q(h^{(1)}, h^{(2)} \mid v)}{P(h^{(1)}, h^{(2)} \mid v)}$$
Q
h
(1)
j {1, . . . , n}
ˆ
h
(1)
j
= P(h
(1)
j
= 1)
ˆ
h
(1)
j
[0, 1]
k {1, . . . , m}
ˆ
h
(2)
k
= P(h
(2)
k
= 1)
ˆ
h
(2)
k
[0, 1]
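$$\begin{aligned}
Q(h^{(1)}, h^{(2)} \mid v) &= \prod_{j=1}^{n} Q(h^{(1)}_j \mid v) \prod_{k=1}^{m} Q(h^{(2)}_k \mid v) \\
&= \prod_{j=1}^{n} \left( \hat{h}^{(1)}_j \right)^{h^{(1)}_j} \left( 1 - \hat{h}^{(1)}_j \right)^{\left( 1 - h^{(1)}_j \right)} \times \prod_{k=1}^{m} \left( \hat{h}^{(2)}_k \right)^{h^{(2)}_k} \left( 1 - \hat{h}^{(2)}_k \right)^{\left( 1 - h^{(2)}_k \right)}
\end{aligned}$$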
Q
P KL(QP )
Q Q
KL(QP )
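$$\begin{aligned}
\mathcal{L}(Q) &= \sum_{h^{(1)}, h^{(2)}} Q(h^{(1)}, h^{(2)} \mid v) \log \frac{P(v, h^{(1)}, h^{(2)}; \theta)}{Q(h^{(1)}, h^{(2)} \mid v)} \\
&= -\sum_{h^{(1)}, h^{(2)}} Q(h^{(1)}, h^{(2)} \mid v)\, E(v, h^{(1)}, h^{(2)}; \theta) - \log Z(\theta) + H(Q),
\end{aligned}$$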
Z(θ) H(Q)
Q(h
(1)
, h
(2)
| v)
Q(h
(1)
, h
(2)
| v)
$$\mathcal{L}(q) = \sum_{i} \sum_{j} v_i W^{(1)}_{ij} \hat{h}^{(1)}_j + \sum_{j'} \sum_{k'} \hat{h}^{(1)}_{j'} W^{(2)}_{j'k'} \hat{h}^{(2)}_{k'} - \ln Z(\theta) + H(q).$$
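$$\frac{\partial}{\partial \hat{h}^{(1)}_j} \mathcal{L}(q) = 0 \quad \forall j \in \{1, \dots, n\}, \qquad \frac{\partial}{\partial \hat{h}^{(2)}_k} \mathcal{L}(q) = 0 \quad \forall k \in \{1, \dots, m\}$$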
ˆ
h
(1)
j
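$$\begin{aligned}
\frac{\partial}{\partial \hat{h}^{(1)}_j} \mathcal{L}(q) &= \frac{\partial}{\partial \hat{h}^{(1)}_j} \left[ \sum_{i} \sum_{j'} v_i W^{(1)}_{ij'} \hat{h}^{(1)}_{j'} + \sum_{j'} \sum_{k'} \hat{h}^{(1)}_{j'} W^{(2)}_{j'k'} \hat{h}^{(2)}_{k'} - \ln Z(\theta) + H(q) \right] \\
&= \frac{\partial}{\partial \hat{h}^{(1)}_j} \Bigg[ \sum_{i} \sum_{j'} v_i W^{(1)}_{ij'} \hat{h}^{(1)}_{j'} + \sum_{j'} \sum_{k'} \hat{h}^{(1)}_{j'} W^{(2)}_{j'k'} \hat{h}^{(2)}_{k'} - \ln Z(\theta) \\
&\qquad\qquad - \sum_{j'} \left( \hat{h}^{(1)}_{j'} \ln \hat{h}^{(1)}_{j'} + (1 - \hat{h}^{(1)}_{j'}) \ln (1 - \hat{h}^{(1)}_{j'}) \right) \\
&\qquad\qquad - \sum_{k'} \left( \hat{h}^{(2)}_{k'} \ln \hat{h}^{(2)}_{k'} + (1 - \hat{h}^{(2)}_{k'}) \ln (1 - \hat{h}^{(2)}_{k'}) \right) \Bigg] \\
&= \sum_{i} v_i W^{(1)}_{ij} + \sum_{k} W^{(2)}_{jk} \hat{h}^{(2)}_k - \ln \left( \frac{\hat{h}^{(1)}_j}{1 - \hat{h}^{(1)}_j} \right),
\end{aligned}$$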
H(q)
ˆ
h
(1)
j
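$$\frac{\partial}{\partial \hat{h}^{(1)}_j} \mathcal{L}(q) = 0 = \sum_{i} v_i W^{(1)}_{ij} + \sum_{k} W^{(2)}_{jk} \hat{h}^{(2)}_k - \ln \left( \frac{\hat{h}^{(1)}_j}{1 - \hat{h}^{(1)}_j} \right)$$
$$\hat{h}^{(1)}_j = \operatorname{sigmoid}\left( \sum_{i} v_i W^{(1)}_{ij} + \sum_{k} W^{(2)}_{jk} \hat{h}^{(2)}_k \right)$$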
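$$\hat{h}^{(1)}_j = \operatorname{sigmoid}\left( \sum_{i} v_i W^{(1)}_{ij} + \sum_{k} W^{(2)}_{jk} \hat{h}^{(2)}_k \right), \quad \forall j$$
$$\hat{h}^{(2)}_k = \operatorname{sigmoid}\left( \sum_{j'} W^{(2)}_{j'k} \hat{h}^{(1)}_{j'} \right), \quad \forall k$$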
L(q)
ˆ
h
(1)
j
ˆ
h
(2)
k
ˆ
h
(1)
j
ˆ
h
(2)
k
10
P (h
(1)
, h
(2)
| v)
$$\mathcal{L}(Q, \theta) = \sum_{i} \sum_{j} v_i W^{(1)}_{ij} \hat{h}^{(1)}_j + \sum_{j'} \sum_{k'} \hat{h}^{(1)}_{j'} W^{(2)}_{j'k'} \hat{h}^{(2)}_{k'} - \ln Z(\theta) + H(Q).$$
P (v | θ)
L(Q, θ)
L(Q, θ)
ˆ
h
(1)
ˆ
h
(2)
L(Q, θ) θ
P (v | θ)
L(Q, θ)
L(Q, θ) θ Q
θ
L(Q, θ) = 0 θ
θ
L(Q, θ)
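$$\begin{aligned}
\nabla_\theta \mathcal{L}(Q, \theta) &= \nabla_\theta \left[ \sum_{i} \sum_{j} v_i W^{(1)}_{ij} \hat{h}^{(1)}_j + \sum_{j'} \sum_{k'} \hat{h}^{(1)}_{j'} W^{(2)}_{j'k'} \hat{h}^{(2)}_{k'} - \ln Z(\theta) + H(Q) \right] \\
&= \nabla_\theta \left[ \sum_{i} \sum_{j} v_i W^{(1)}_{ij} \hat{h}^{(1)}_j + \sum_{j'} \sum_{k'} \hat{h}^{(1)}_{j'} W^{(2)}_{j'k'} \hat{h}^{(2)}_{k'} \right] - \nabla_\theta \ln Z(\theta)
\end{aligned}$$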
ˆ
h
(1)
ˆ
h
(2)
nE
P (v,h
(1)
,h
(2)
)
θ
E(v, h
(1)
, h
(2)
)
Q(h | v, θ
(t)
) = P (h |
v, θ
(t)
) L(Q, θ) = P (v | θ)
P (h | v)
W
(1)
W
(2)
c)
d)
b)
a)
logP (v)
h
(1)
k log P (h
(1)
, )
h
(1)
k
logP (v, ) k = 5
h
(1)
h
(
2)
log P ( | v)
p
Z
p(h | v)
P (h | v)
h
s
(h s)W
W
:,i
i
i
W
:,i
h s
n d
p = max
i i
2
n
3 × 3
2
9
= 512
n + 1
n
n + 1 n + 1
n + 1
E
Z
L
z x g(z)
p(z)
g(z)
$$p(x \mid z) = \mathcal{N}\left( x \,\middle|\, g(z),\ \frac{1}{\beta} I \right),$$
β g(z)
g d
x
x
d g
d
$$g^{*} = \arg\min_{g} \max_{d} v(g, d)$$
$$v(g, d) = \mathbb{E}_{x \sim p_{\text{data}}} \log d(x) + \mathbb{E}_{x \sim p_{\text{model}}} \log \left( 1 - d(x) \right).$$
g d
g
max
d
v(g, d)
d
g
P (x
t
| x
t1
, . . . , x
1
)
k
k
x
tk
x
t
x
1
x
2
x
3
x
4
P(x
4
|x
1
,x
2
,x
3
)
P(x
3
|x
1
,x
2
)
P(x
2
|x
1
)
P(x
1
)
x
1
x
2
x
3
x
4
i i1
i
i 1 i 1
P (x
t
| x
t1
, . . . , x
1
)
O(T
2
) T
x
1
x
2
x
3
x
4
h
1
h
2
h
3
P(x
4
|x
1
,x
2
,x
3
)
P(x
3
|x
1
,x
2
)
P(x
2
|x
1
)
P(x
1
)
i x
i
i 1
h
i
x
1
, . . . , x
i
x
i+1
, x
i+2
, . . .
P (x
t
| x
t1
, . . . , x
1
)
(t 1) × k k k
x
t
x
t
x
t+k
k > 0
t
x
1
, . . . , x
t
x
t+1
, x
t+2
, . . .
P (x
t
| x
t1
, . . . , x
1
)
x
t
j W
jki
i x
i
k
j h
jk
j i
W
jki
= W
ki
.
k
P (x
j
| x
j1
, . . . , x
1
)
n! n o p(x | o)
o
$$p_{\text{ensemble}}(x) = \frac{1}{k} \sum_{i=1}^{k} p(x \mid o^{(i)}).$$
O(nh)
h
h
i
O(n
2
h)
O(n
2
h
2
)
l
l +1 n h i
l + 1 i
l O(nh
2
) h
ω
t
= g(h
t
)
h
t
= fx
t
)
f
g
P (X|ω)
C(
˜
X|X)
x
t+1
C x
˜
x f h = f(
˜
x)
g ω ω
P (x | ω = g(f(
˜
x)))
$g(h) = \hat{x} \approx \mathbb{E}[x \mid \tilde{x}]$
P (x | ω)
ˆ
x
C P
f g
x
˜
x
C(
˜
x | x)
˜
x h = f(
˜
x)
h ω = g(h) P (x | ω = g(h)) = P (x |
˜
x)
x P (x | ω = g(h)) = P (x |
˜
x)
P (x |
˜
x)
x
P (x
f
| x
o
)
x
f
x
o
x
f
k
X
0
H
1
X
1
H
2
H
3
X
2
H
0
W
1
W
1
W
1
W
2
W
2
W
2
W
3
W
3
X
1
H
1
X
2
X
3
H
2
H
3
W
3
X
0
data target target target
x
X
2
X
0
X
1
H
0
H
1
H
2
H X X
k
H
k
k
P (X
k
| H
k
)
P (H
k
| H
k1
, X
k1
)
X
0
= x x
log P (X
k
= x | H
k
) H
k
X
0
= x
log P (X
k
= x | H
k
)
P (x) x
P (y | x)
P (y | x)
y x
X
X
y x
log Z log ˜p(x) log Z
Z
log p(x)
k
p(v, h
(1)
, h
(2)
; θ +
θ
) p(v, h
(1)
, h
(2)
; θ)
˜
V
˜
H
(1)
˜
H
(2)
m
m
V =
v
(1)
, . . . , v
(m)
˜
H
(1)
sigmoid
V
W
(1)
+
˜
H
(2)
W
(2)
˜
H
(2)
sigmoid
˜
H
(1)
W
(2)
$\Delta_{W^{(1)}} \leftarrow \frac{1}{m} V^\top \hat{H}^{(1)}$
$\Delta_{W^{(2)}} \leftarrow \frac{1}{m} \hat{H}^{(1)\top} \hat{H}^{(2)}$
l = 1 k
˜
V
m
i=1
d
a=1
sigmoid
W
(1)
a,:
˜
H
(2)
:,i
˜
H
(2)
m
i=1
m
b=1
sigmoid
˜
H
(1)
:,i
W
(2)
:,b
˜
H
(1)
m
i=1
n
j=1
sigmoid
˜
V
:,i
W
(1)
:,j
+ W
(2)
j,:
˜
H
(2)
:,i
W
(1)
W
(1)
1
m
V
ˆ
H
(1)
W
(2)
W
(2)
1
m
ˆ
H
(1)
ˆ
H
(2)
$W^{(1)} \leftarrow W^{(1)} + \epsilon \Delta_{W^{(1)}}$
$W^{(2)} \leftarrow W^{(2)} + \epsilon \Delta_{W^{(2)}}$